# Kernel: sgemm_kernel_64
#
# SharedSize: 8192
# Params(9):
#   0:0x140:4:4 param_C,
#   1:0x144:4:0 param_m,
#   2:0x148:4:0 param_n,
#   3:0x14c:4:0 param_k,
#   4:0x150:4:0 param_lda,
#   5:0x154:4:0 param_ldb,
#   6:0x158:4:0 param_ldc,
#   7:0x15c:4:0 param_alpha,
#   8:0x160:4:4 param_D // for diagnostic printf output
#
# Globals:
#   c[0x0][0x164]: texA (the value is 1)
#   c[0x0][0x168]: texB (the value is 0)

// Symbolic names for the registers used by the instructions in this file.
// Several ranges below overlap on purpose: the cz names cover the same
// registers 0-63 as the cx/y accumulator names and the setup temporaries,
// cs0-cs7 share 64-71 with the j0 operands, zOffset shares register 80 with
// the j1 range, and the 72-111 names share numbers with the j and loadX
// registers. Names that share a register cannot hold live values at the
// same time.
// NOTE(review): the difference between ':' and '~' lines and the meaning of
// the bracketed suffixes are assembler mapping syntax that this file does
// not explain; confirm against the assembler documentation before editing.
<REGISTER_MAPPING>

    0-63    ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end

    80      : zOffset
    0-63    : cz<00-63>

     3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35>
     7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35>
     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35>
     5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35>
    35,34,43,42,51,50,59,58 : cx32y<00-03|32-35>
    39,38,47,46,55,54,63,62 : cx33y<00-03|32-35>
    33,32,41,40,49,48,57,56 : cx34y<00-03|32-35>
    37,36,45,44,53,52,61,60 : cx35y<00-03|32-35>

    64-79   : j0Ax<00-03|32-35>, j0By<00-03|32-35>
    80-95   : j1Ax<00-03|32-35>, j1By<00-03|32-35>

    64-71   : cs<0-7>

    96-111  : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3>

    112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32

    72-111  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX

</REGISTER_MAPPING>

// Read the thread index and the block coordinates. Each S2R signals its own
// dependency barrier (1, 2 and 3); the first two instructions of the setup
// block that follows wait on them.
--:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
--:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
--:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3

<SCHEDULE_BLOCK>

// P0 is set for threads with tid >= 32. Those threads take the "B" values
// (by, ldb, texB); the remaining threads take the "A" values (bx, lda, texA).
// blk  = tid >= 32 ? by   : bx;
// ldx4 = tid >= 32 ? ldb  : lda;
// tex  = tid >= 32 ? texB : texA;
01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1
06:-:-:-:1      SEL blk, by, bx, P0;              // Wait Dep 2 & 3
--:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150];
--:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154];
--:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
--:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB

// zOffset = tid & -32 (tid with its low five bits cleared).
// Store a 128-bit zero vector (RZ) at that shared memory offset. The cz
// accumulator registers are loaded back from the same address right after
// this block.
--:-:-:-:1      LOP.AND zOffset, tid, -32;
--:-:-:-:1      STS.128 [zOffset + 4x<16*64>], RZ;

// tid2   = (tid >> 4) & 1
// tid15  = tid & 15
// tid31  = tid & 31
// tid32  = tid & 32
--:-:-:-:1      BFE.U32 tid2,  tid, 0x104; // 1 bit at position 4
--:-:-:-:1      LOP.AND tid15, tid, 15;
--:-:-:-:1      LOP.AND tid31, tid, 31;
--:-:-:-:1      LOP.AND tid32, tid, 32;

// ldx   = ldx4 >> 2;
// ldx8  = ldx4 + ldx4;
--:-:-:-:1      SHR.U32 ldx, ldx4, 2;
--:-:-:-:1      IADD ldx8, ldx4, ldx4;

// track0 = blk*64/4 + tid15 + (ldx * tid2)
// track2 = track0 + ldx + ldx
// track4 = track0 + ldx4
// track6 = track2 + ldx4
--:-:-:-:1      ISCADD  track0, blk, tid15, 4;
--:-:-:-:1      XMAD.LO track0, ldx, tid2,  track0, xmad_t0;
--:-:-:-:1      IADD3 track2, track0, ldx, ldx;
--:-:-:-:1      IADD track4, track0, ldx4;
--:-:-:-:1      IADD track6, track2, ldx4;

// writeS = tid15*4*4 + tid2*64*4
--:-:-:-:1      SHL    tid15_4, tid15, 4;
--:-:-:-:1      ISCADD writeS, tid2, tid15_4, 8;

// writeS += 2048 if tid >= 32
--:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*64>;

// int end = track0 + (k-8)*ldx;
--:-:-:-:1      MOV k, c[0x0][0x14c];
--:-:-:-:1      IADD k, k, -8;
--:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;

// readAs = ((tid >> 1) & 7) << 4;
--:-:-:-:1      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      SHL     readAs, readAs, 4;

// readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048;
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readBs, tid,    0x30;
--:-:-:-:1      SHR.U32 readBs, readBs, 3;
--:-:-:-:1      LOP.OR  readBs, readBs, tid1;
--:-:-:-:1      ISCADD  readBs, readBs, 4x<8*64>, 4;

// Issue the first four texture loads. Each one signals its own dependency
// barrier (1-4); the shared memory stores after this block wait on them.
<ORDERED>
--:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
--:-:2:-:1      TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
--:-:3:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
--:-:4:-:1      TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4
</ORDERED>

</SCHEDULE_BLOCK>

<CODE>
    # Emit sixteen 128-bit shared memory loads, one for every fourth cz
    # register (cz00, cz04, ... cz60). All of them read the zOffset address
    # that the setup block filled with zeros, and each one signals
    # dependency barrier 5.
    my $loads = '';
    for (my $reg = 0; $reg < 64; $reg += 4)
    {
        $loads .= sprintf "--:-:5:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $reg;
    }
    return $loads;
</CODE>

<SCHEDULE_BLOCK>

// Write the four texture results fetched during setup into shared memory.
// Each store waits on the barrier signalled by the matching texture load.
01:-:-:-:1      STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1
02:-:-:-:1      STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2
04:-:-:-:1      STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
08:-:-:-:1      STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4

// Advance every track index by ldx8 so the loop fetches the next tile.
--:-:-:-:1      IADD track0, track0, ldx8;
--:-:-:-:1      IADD track2, track2, ldx8;
--:-:-:-:1      IADD track4, track4, ldx8;
--:-:-:-:1      IADD track6, track6, ldx8;

// Barrier 5 is signalled by the cz loads emitted just before this block.
10:-:-:-:5      BAR.SYNC 0; // Wait Dep 5

</SCHEDULE_BLOCK>

// Toggle the 4096 byte bit of writeS so the next shared stores go to the
// other half of the shared buffer. Stall count 0: issued together with the
// load that follows.
--:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*64>;

// Load the first set of A and B operands (the j0 registers) for the loop.
// The last load signals barrier 1, which the first FFMA of the loop waits on.
--:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
--:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
--:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
--:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1

// Efficiency:
// ffma: 512
// lds:  32 dual issued
// sts:  4  dual issued
// tex:  4  dual issued
// add:  4
// xor:  3
// setp: 1
// bar:  1  dual issued
// bra:  1  dual issued
// Total: 520 (512/520 = 98.5% FFMA)

LOOP:

// Loop end condition: P0 is set while track0 <= end. The generated loop body
// below predicates its texture loads, shared memory stores, offset flips,
// track updates and the branch back to LOOP on P0, so the final pass only
// runs the FFMAs, the unpredicated operand loads and the barrier.
--:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;

<CODE>

    # Visiting order for the 64 accumulators of one unroll step. The four y
    # bases are taken in sequence; the x bases are walked forwards for the
    # first and third y base and backwards for the second and fourth. Every
    # (x, y) base expands into four accumulators at the offsets
    # (+2,+0), (+2,+1), (+0,+1), (+0,+0).
    my @patch  = ([2,0], [2,1], [0,1], [0,0]);
    my @xBases = (0, 1, 32, 33);
    my @yBases = (0, 2, 32, 34);
    my @visit;
    for my $r (0 .. 3)
    {
        my @sweep = $r % 2 ? reverse(@xBases) : @xBases;
        for my $xBase (@sweep)
        {
            for my $delta (@patch)
            {
                push @visit, [$xBase + $delta->[0], $yBases[$r] + $delta->[1]];
            }
        }
    }

    # $extra{$j}{$c} is the text emitted directly after FFMA number $c of
    # unroll step $j.
    my %extra;

    # Steps 0 and 1: texture loads for the next tile, only while P0 is set.
    $extra{0}{31} = "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n";
    $extra{0}{33} = "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n";
    $extra{1}{31} = "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n";
    $extra{1}{33} = "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n";

    # Steps 5 and 6: write the fetched tile to shared memory.
    $extra{5}{30} = "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n";
    $extra{5}{34} = "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n";
    $extra{6}{30} = "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n";
    $extra{6}{34} = "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n";

    # Near the end of step 6: synchronize, then flip all three shared offsets.
    $extra{6}{62} = join '',
        "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n",
        "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n",
        "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n",
        "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n";

    # After the very last FFMA: advance the track indices and loop again.
    $extra{7}{63} = join '',
        "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n",
        "--:-:-:-:1  \@P0 IADD track2, track2, ldx8;\n",
        "--:-:-:-:1  \@P0 IADD track4, track4, ldx8;\n",
        "--:-:-:-:0  \@P0 IADD track6, track6, ldx8;\n",
        "--:-:-:Y:5  \@P0 BRA LOOP;\n";

    # Operand refill loads placed after FFMAs 0, 2, 4 and 6 of every step:
    # slot, barrier field, operand name, x/y offset, shared base, trailing note.
    my @refill = (
        [0, '-', 'Ax', '00', 'readAs', ''],
        [2, '-', 'By', '00', 'readBs', ''],
        [4, '-', 'Ax', '32', 'readAs', ''],
        [6, '1', 'By', '32', 'readBs', ' // Set Dep 1'],
    );

    my $text = '';
    for my $j (0 .. 7)
    {
        my $cur     = $j % 2;                  # operand set read by this step's FFMAs
        my $fill    = 1 - $cur;                # operand set loaded for the next step
        my $nextRow = $j == 7 ? 0 : $j + 1;    # shared memory row holding the next operands
        my $guard   = $j < 7 ? '   ' : '@P0';  # the wrap-around loads only run while looping

        for my $entry (@refill)
        {
            my ($slot, $barrier, $operand, $offset, $base, $note) = @$entry;
            $extra{$j}{$slot} = sprintf "--:-:%s:-:1  %s LDS.U.128 j%d%s%s, [%s + 4x<%d*64 + %s>];%s\n",
                $barrier, $guard, $fill, $operand, $offset, $base, $nextRow, $offset, $note;
        }

        my $c = 0;
        for my $pair (@visit)
        {
            my ($x, $y) = @$pair;
            my $tail = exists $extra{$j}{$c} ? $extra{$j}{$c} : '';

            # The first FFMA of steps 0-6 waits for the operand loads; step 7
            # needs no wait because the BAR.SYNC in step 6 already waited.
            my $first = $c == 0 && $j < 7;

            # Stall count 0 when a memory or barrier instruction follows.
            my $paired = $tail =~ /LDS|TLD|STS|BAR/;

            my $ctrl = join ':',
                ($first ? '01' : '--'), '-', '-',
                ($c == 32 ? 'Y' : '-'),
                ($paired ? 0 : 1);

            $text .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n",
                $ctrl, $x, $y, $cur, $x, $cur, $y, $x, $y, ($first ? ' // Wait Dep 1' : '');
            $text .= $tail;
            $c++;
        }
    }
    return $text;

</CODE>

<SCHEDULE_BLOCK>

// Output setup: derive the shared memory offsets, the output coordinates and
// the row pointers used by STORE_C, and scale the first row of accumulators.

// Keep only the low 11 bits of the shared read offsets.
--:-:-:-:1      LOP.AND readAs, readAs, 0x7ff;
--:-:-:-:1      LOP.AND readBs, readBs, 0x7ff;

// writeCs = (readBs / 4) * 64 + readAs;
--:-:-:-:1      ISCADD  writeCs, readBs, readAs, 4;

// readCs = ((tid32 << 3) + tid31) << 2;
--:-:-:-:1      ISCADD  readCs, tid32,  tid31, 3;
--:-:-:-:1      SHL     readCs, readCs, 2;

// cx = bx*64 + tid31;
--:-:-:-:1      ISCADD  cx, bx, tid31, 6;

// cy = by*64 + (tid32 >> 1)
--:-:-:-:1      SHR.U32 cy00, tid32, 1;
--:-:-:-:1      ISCADD  cy00, by, cy00, 6;

// C += (cy*ldc + cx) * 4;
--:-:-:-:1      MOV ldc, c[0x0][0x158];
--:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
--:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;

// P5 and P6 are the column bounds checks reused by every STORE_C call.
--:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
--:-:-:-:1      IADD cx, cx, 32;
--:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m

// D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
// D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
//--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
//--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
//--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmadD;
//--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmadD;
//--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5

//--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
//--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
//--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
//--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
//--:-:-:-:1      STG.CS [D + 4x<4>], cx;
//--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
//--:-:-:-:1      STG.CS [D + 4x<6>], ci;
//--:-:-:-:1      STG.CS [D + 4x<7>], cx35y35;

// Start one row early: STORE_C adds 1 to every cy counter before testing it.
--:-:-:-:1      IADD cy00, cy00, -1;
--:-:-:-:1      IADD cy04, cy00,  4;
--:-:-:-:1      IADD cy08, cy00,  8;
--:-:-:-:1      IADD cy12, cy00,  12;

// ldc1  = ldc * 4   (one row, in bytes)
// ldc4  = ldc * 16  (4 rows)
// ldc8  = ldc * 32  (8 rows)
// ldc28 = ldc * 128 - ldc4 = ldc * 112 (28 rows)
--:-:-:-:1      SHL  ldc1,  ldc, 2;
--:-:-:-:1      SHL  ldc4,  ldc, 4;
--:-:-:-:1      SHL  ldc8,  ldc, 5;
--:-:-:-:1      ISCADD ldc28, ldc, -ldc4, 7;

// Scale the y00 accumulators by alpha into cs0-cs7 for the first STORE_C call.
--:-:-:-:1      MOV alpha, c[0x0][0x15c];
--:-:-:-:1      FMUL cs0, cx00y00, alpha;
--:-:-:-:1      FMUL cs1, cx01y00, alpha;
--:-:-:-:1      FMUL cs2, cx02y00, alpha;
--:-:-:-:1      FMUL cs3, cx03y00, alpha;
--:-:-:-:1      FMUL cs4, cx32y00, alpha;
--:-:-:-:1      FMUL cs5, cx33y00, alpha;
--:-:-:-:1      FMUL cs6, cx34y00, alpha;
--:-:-:-:1      FMUL cs7, cx35y00, alpha;

// Row pointers also start one row early: STORE_C adds ldc1 to each of them
// before storing.
--:-:-:-:1      IADD Cy00, Cy00, -ldc1;
--:-:-:-:1      IADD Cy04, Cy00, ldc4;
--:-:-:-:1      IADD Cy08, Cy00, ldc8;
--:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)

</SCHEDULE_BLOCK>

<CODE>

    # Emit one STORE_C call per accumulator row y (0-3, then 32-35).
    #  - Row 0 was already scaled into cs0-cs7 by the preceding block, so only
    #    the call is emitted for it.
    #  - Before row 32 the cy counters move forward by 28 and the Cy pointers
    #    by ldc28.
    #  - Every other row is scaled by alpha into cs0-cs7 first.
    my @xIndex = (0, 1, 2, 3, 32, 33, 34, 35);
    my $asm    = '';

    for my $row (0, 1, 2, 3, 32, 33, 34, 35)
    {
        # On row 32 the wait on barrier 2 is carried by the first pointer
        # update; on every other row the first FMUL carries it.
        my $jump = $row == 32;

        if ($jump)
        {
            for my $n (qw(00 04 08 12))
            {
                $asm .= "--:-:-:-:1      IADD cy$n, cy$n, 28;\n";
            }
            $asm .= "\n";

            for my $n (qw(00 04 08 12))
            {
                my ($mask, $note) = $n eq '00' ? ('02', ' // Wait Dep 2') : ('--', '');
                $asm .= "$mask:-:-:-:1      IADD Cy$n, Cy$n, ldc28;$note\n";
            }
            $asm .= "\n";
        }

        if ($row != 0)
        {
            for my $i (0 .. 7)
            {
                my $mask  = '--';
                my $stall = 1;
                my $note  = '';

                if ($i == 0 && !$jump)
                {
                    $mask = '02';
                    $note = ' // Wait Dep 2';
                }
                elsif ($i == 7)
                {
                    $stall = 0;
                    $note  = ' // Dual Issue';
                }

                $asm .= sprintf "%s:-:-:-:%d      FMUL cs%d, cx%02dy%02d, alpha;%s\n",
                    $mask, $stall, $i, $xIndex[$i], $row, $note;
            }
        }

        $asm .= "--:-:-:-:5      CAL STORE_C;\n\n";
    }
    return $asm;

</CODE>

// End of the main program; the STORE_C subroutine below is reached only
// through CAL.
--:-:-:-:5      EXIT;

// STORE_C: write the eight values in cs0-cs7 to the output matrix.
// In:     cs0-cs7, writeCs, readCs, cy00-cy12, Cy00-Cy12, ldc1, P5, P6.
// Out:    cy00-cy12 incremented by 1, Cy00-Cy12 advanced by ldc1.
// Uses:   P0-P3 and cs0-cs7 are overwritten.
// The final store signals barrier 2; the caller waits on it before writing
// cs0-cs7 again.
STORE_C:

<SCHEDULE_BLOCK>

// Write cs0-cs3 and cs4-cs7 to shared memory as two 128-bit vectors.
--:-:-:-:1      STS.128 [writeCs+4x<00>], cs0;
--:-:-:-:1      STS.128 [writeCs+4x<32>], cs4;

// Read eight single values back through readCs: four rows of 64 words,
// taking word +0 and word +32 from each row.
// NOTE(review): presumably this exchanges values between threads so the
// global stores below are contiguous across a warp -- confirm.
--:-:-:-:1      LDS cs0, [readCs + 4x<0*64 + 00>];
--:-:-:-:1      LDS cs1, [readCs + 4x<0*64 + 32>];
--:-:-:-:1      LDS cs2, [readCs + 4x<1*64 + 00>];
--:-:-:-:1      LDS cs3, [readCs + 4x<1*64 + 32>];
--:-:-:-:1      LDS cs4, [readCs + 4x<2*64 + 00>];
--:-:-:-:1      LDS cs5, [readCs + 4x<2*64 + 32>];
--:-:-:-:1      LDS cs6, [readCs + 4x<3*64 + 00>];
--:-:1:-:1      LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1

// Step the row counters and the row pointers to the row being stored.
--:-:-:-:1      IADD cy00, cy00, 1;
--:-:-:-:1      IADD cy04, cy04, 1;
--:-:-:-:1      IADD cy08, cy08, 1;
--:-:-:-:1      IADD cy12, cy12, 1;

--:-:-:-:1      IADD Cy00, Cy00, ldc1;
--:-:-:-:1      IADD Cy04, Cy04, ldc1;
--:-:-:-:1      IADD Cy08, Cy08, ldc1;
--:-:-:-:1      IADD Cy12, Cy12, ldc1;

// Bounds checks for rows cy00 and cy04, combined with the column checks.
--:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
--:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m
--:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
--:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m

01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
--:-:-:-:1  @P1 STG.CG [Cy00 + 4x<32>], cs1;
--:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
--:-:-:-:1  @P3 STG.CG [Cy04 + 4x<32>], cs3;

// P0-P3 are reused for the bounds checks of rows cy08 and cy12.
--:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
--:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m
--:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
--:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m

--:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
--:-:-:-:1  @P1 STG.CG [Cy08 + 4x<32>], cs5;
--:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
--:2:-:-:1  @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2

</SCHEDULE_BLOCK>

--:-:-:-:5      RET;

